2017/05/10 by Andrew Chang
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, Image
# Show the eight figure panels for this analysis.
# BUG FIX: a bare `Image(...)` expression only renders when it is the last
# expression of a notebook cell (and never in a plain script) -- wrap each
# one in display() so every figure is actually shown.
# NOTE(review): IPython's Image does not expand '~'; confirm these paths
# resolve from the notebook's working directory.
for _fig_path in ['~/F%d.png' % i for i in range(1, 8)]:
    display(Image(_fig_path))
display(Image('~/F8.png', width=500, height=500))
# Load the demo sequence data set (expects 'var_seq' and 'Label' columns).
df = pd.read_csv('~/demo_seq.csv')
# Check the size of the data: (n_rows, n_columns).
# FIX: Python 2 print statement -> print() function (required on a
# modern Python 3 notebook kernel).
print("size of df:", df.shape)
# Load the table of 7 physico-chemical properties per amino acid
# (rows indexed by one-letter amino-acid code, one column per property).
aaindex_7 = pd.read_csv('~/AAIndex_Andrew_choice.csv', header=0, index_col=0)
display(aaindex_7)
Convert the amino-acid sequence strings into numeric feature vectors so that machine-learning algorithms can handle them.
def generate_seq_ft_lb(s_data_num, aa_table=None):
    """Encode amino-acid sequences as flat numeric feature vectors.

    Every residue of each sequence in ``s_data_num.var_seq`` is replaced by
    its row of physico-chemical property values from ``aa_table``, and the
    per-residue vectors are concatenated so each sample becomes one row of
    ``num_aa * num_chemprop`` features (features 0..num_chemprop-1 are
    residue 1, the next num_chemprop are residue 2, and so on).

    Parameters
    ----------
    s_data_num : pandas.DataFrame
        Must have a ``var_seq`` column of equal-length amino-acid strings
        and a ``Label`` column of integer class labels.
    aa_table : pandas.DataFrame, optional
        Property table indexed by one-letter amino-acid code, one column
        per chemical property.  Defaults to the module-level ``aaindex_7``
        table (backward compatible with the original behavior).

    Returns
    -------
    tuple
        ``(seq_feature, seq_label, num_aa, num_chemprop)`` where
        ``seq_feature`` is an ndarray of shape
        ``(n_samples, num_aa * num_chemprop)``, ``seq_label`` an int
        ndarray of shape ``(n_samples,)``, ``num_aa`` the residues per
        sample and ``num_chemprop`` the property values per residue.
    """
    if aa_table is None:
        aa_table = aaindex_7  # keep the original module-level default

    # Look up every residue's property vector directly; this replaces the
    # original in-place Series mutation loop and yields the same
    # (n_samples, num_aa, num_chemprop) array.
    seq_feature = np.array([[aa_table.loc[charc, :].tolist() for charc in seq]
                            for seq in s_data_num.var_seq])
    seq_label = np.array(s_data_num.Label.reset_index(drop=True)).astype(int)

    num_total_sample, num_aa, num_chemprop = seq_feature.shape
    print('%d samples. Each sample has %d features, with %d values per feature.'
          % (num_total_sample, num_aa, num_chemprop))
    print('%d labels.' % seq_label.shape[0])

    # Flatten to one row per sample: num_aa * num_chemprop features.
    seq_feature = seq_feature.reshape(num_total_sample, num_aa * num_chemprop)
    print("")
    print('The shape of transformed features', seq_feature.shape)
    print("Feature of the original 1st data: \n", seq_feature[0])
    print('====================================================================\n')
    return (seq_feature, seq_label, num_aa, num_chemprop)
# feed in the data for encoding
# Encode every sequence in df into flattened physico-chemical features and
# integer labels; num_aa / num_chemprop are kept so feature importances can
# later be reshaped back to a (position, property) grid.
seq_feature, seq_label, num_aa, num_chemprop = generate_seq_ft_lb(df)
from sklearn.model_selection import GridSearchCV, validation_curve, StratifiedShuffleSplit
from sklearn import metrics
from sklearn.metrics import classification_report

# Single stratified 80/20 split: n_splits=1 behaves like one shuffled
# train/test split while preserving the class proportions.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(seq_feature, seq_label):
    features_train, features_test = seq_feature[train_index], seq_feature[test_index]
    labels_train, labels_test = seq_label[train_index], seq_label[test_index]
# FIX: Python 2 print statements -> print() function; also dropped the
# original `sss.get_n_splits(...)` call whose return value was discarded.
print("Shape of Feature Train:", features_train.shape)
print("Shape of Feature Test :", features_test.shape)
print("# Label Train:", len(labels_train))
print("# Label Test :", len(labels_test))
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from xgboost import plot_tree

# Gradient-boosted tree classifier: many boosting rounds of moderately deep
# trees at a small learning rate, with several regularization knobs set to
# fight overfitting on this small data set.
xgbc = XGBClassifier(
    # -- core objective and boosting schedule --
    objective='binary:logistic',  # binary classification with probability output
    n_estimators=2000,            # boosting rounds; large because eta is small
    learning_rate=0.01,           # small eta must be paired with many estimators
    # -- tree shape / split control --
    max_depth=10,                 # default 6; deeper trees overfit more easily
    gamma=0.5,                    # min loss reduction to split; larger = more conservative
    min_child_weight=6,           # min hessian sum per child (default 1); larger = more conservative
    max_delta_step=5,             # caps each tree's weight update; can help with imbalanced logistic tasks
    # -- stochastic subsampling --
    subsample=0.6,                # row subsampling per tree (default 1) to reduce overfitting
    colsample_bytree=0.8,         # column subsampling when building each tree
    colsample_bylevel=1,          # column subsampling per tree level
    # -- regularization and class balance --
    reg_alpha=0,                  # L1 penalty on leaf weights (default 0)
    reg_lambda=1,                 # L2 penalty on leaf weights (default 1)
    scale_pos_weight=5,           # imbalance weight, typically sum(neg) / sum(pos)
    # -- runtime / misc --
    nthread=-1,                   # use all available cores
    silent=True,                  # suppress per-round training messages
    # base_score=0.5,
    seed=0,
    missing=None)
# Pack the train/test features and labels into DMatrix form for xgb.cv.
dtrain = xgb.DMatrix(features_train, labels_train)
dtest = xgb.DMatrix(features_test, labels_test)
# Grab the hyperparameters (add 'num_class' here if doing multiclass xgb.cv).
param = xgbc.get_params()

# Quick training run with early stopping, before any hyperparameter tuning.
t0 = time()
xgbc.fit(features_train, labels_train,
         eval_set=[(features_test, labels_test)], eval_metric='auc',
         early_stopping_rounds=30)
print("training time:", round(time() - t0, 3), "s")  # time the training process
# BUG FIX: XGBClassifier.score() returns mean ACCURACY, not AUC -- the
# original printed it under the label "AUC".  Report accuracy under its
# correct name and compute the actual AUC from predicted probabilities.
score = xgbc.score(features_test, labels_test)
print("Accuracy:", round(score, 2))
auc = metrics.roc_auc_score(labels_test, xgbc.predict_proba(features_test)[:, 1])
print("AUC: ", round(auc, 2))
print(classification_report(labels_test, xgbc.predict(features_test)))
Hydrophobicity at position 4
Rigidity at position 4
Surface area at positions 6 and 20
Molecular weight at position 20
# Reshape the flat feature importances back to a (position, property) grid,
# then plot properties (rows) against amino-acid position (columns).
imp_features = xgbc.feature_importances_.reshape([num_aa, num_chemprop])
imp_features = pd.DataFrame(imp_features).T
imp_features.index = aaindex_7.columns.tolist()
plt.rcParams['figure.figsize'] = (17, 3)
fig, ax1 = plt.subplots(nrows=1, ncols=1)
fig.suptitle('Importance of AA Positions and Chemical Properties')
# BUG FIX: the original called sns.heatmap() but seaborn is never imported
# anywhere in this file (NameError at runtime).  Draw the heatmap with
# matplotlib, which is already imported.
im = ax1.pcolormesh(imp_features.values)
ax1.set_yticks(np.arange(imp_features.shape[0]) + 0.5)
ax1.set_yticklabels(imp_features.index)
ax1.invert_yaxis()  # match seaborn's orientation: first row at the top
fig.colorbar(im, ax=ax1)
ax1.set_xlabel('amino acid position')
# Sanity-check the classifier on the first ten test examples of each class,
# comparing true labels with hard predictions and class probabilities.
# FIX: Python 2 print statements -> print() function.
idx_test_0 = labels_test == 0
print("True Label - Good :", labels_test[idx_test_0][:10])
print("The prediction says:", xgbc.predict(features_test)[idx_test_0][:10])
print("The prediction prob:", xgbc.predict_proba(features_test)[idx_test_0][:10])
print("")
idx_test_1 = labels_test == 1
print("True Label - Bad  :", labels_test[idx_test_1][:10])
print("The prediction says:", xgbc.predict(features_test)[idx_test_1][:10])
print("The prediction prob:", xgbc.predict_proba(features_test)[idx_test_1][:10])
# Visualize the first boosted tree with graphviz
# (needs the binary installed, e.g. `brew install graphviz`).
gvfile = '~/xgbc_trees.gv'
dot_data = xgb.to_graphviz(xgbc, num_trees=0)  # tree #0 as a graphviz source object
dot_data.render(gvfile, view=False)            # write the .gv (and rendered file) to disk
dot_data  # last expression of the cell -> inline display in a notebook
# Grid-search n_estimators and learning_rate first, using 3-fold stratified
# shuffle-split CV scored by ROC AUC.
# FIX: Python 2 print statement on the last line -> print() function;
# stale commented-out fit call removed.
cv_sets = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
parameters = {'n_estimators': [1000, 2000], 'learning_rate': [0.01, 0.05]}
grid = GridSearchCV(xgbc, parameters, cv=cv_sets, scoring='roc_auc', n_jobs=3)  # 'f1' is another option
grid.fit(features_train, labels_train)
print("The best parameters are %s with AUC of %0.2f" % (grid.best_params_, grid.best_score_))
print(grid.best_estimator_)